{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import torchvision\n",
    "import numpy as np\n",
    "from keras.preprocessing.image import load_img\n",
    "import torchvision.transforms as T\n",
    "\n",
    "model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)\n",
    "model.eval()\n",
    "\n",
    "img_path = '2007_000032.jpg'\n",
    "img = load_img(img_path, target_size=(800, 800))\n",
    "transform = T.Compose([T.ToTensor()])  # Defing PyTorch Transform\n",
    "img = transform(img)  # Apply the transform to the image\n",
    "\n",
    "images = [img]\n",
    "targets=None\n",
    "original_image_sizes = [img.shape[-2:] for img in images]\n",
    "images, targets = model.transform(images, targets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "提取特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = model.backbone(images.tensors)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "提取proposals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1000, 4])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "proposals, proposal_losses = model.rpn(images, features, targets)\n",
    "proposals[0].size()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "proposals的大小是1000，我们随机选取一个进行计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([257.4335, 268.6282, 648.8815, 444.5958])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "proposals[0][76]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们原始图片大小是800x800，特征大小是50x50，所以缩放比例是16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[torch.Size([800, 800])]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "image_shapes = images.image_sizes\n",
    "image_shapes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 256, 50, 50])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "third_level_feature = features[2]        \n",
    "third_level_feature.size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[16.08959375, 16.7892625, 40.55509375, 27.7872375]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[x/16 for x in [257.4335, 268.6282, 648.8815, 444.5958]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "现在我们在特征上的区域是"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "rois_in_feature = torch.Tensor([[0, 16.0896, 16.7893, 40.5551, 27.7872]]) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(0.0741, grad_fn=<SelectBackward>)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "third_level_feature[0][0][16][16] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "roi_start_w = 16\n",
    "roi_start_h = 17\n",
    "roi_end_w = 41\n",
    "roi_end_h = 28\n",
    "\n",
    "roi_width = (roi_end_w - roi_start_w + 1)\n",
    "roi_height = (roi_end_h - roi_start_h + 1)\n",
    "\n",
    "bin_size_w = roi_width / 7\n",
    "bin_size_h = roi_height / 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[tensor(0.4245, grad_fn=<MaxBackward1>),\n",
       " tensor(0.4636, grad_fn=<MaxBackward1>),\n",
       " tensor(0.3529, grad_fn=<MaxBackward1>),\n",
       " tensor(0.0751, grad_fn=<MaxBackward1>),\n",
       " tensor(0.3588, grad_fn=<MaxBackward1>),\n",
       " tensor(0.6445, grad_fn=<MaxBackward1>),\n",
       " tensor(0.0492, grad_fn=<MaxBackward1>)]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import math\n",
    "self_result = []\n",
    "\n",
    "for j in range(7):\n",
    "    sub_res = []\n",
    "    for i in range(7):\n",
    "        sub_res.append(third_level_feature[..., int(17+bin_size_h*(j)):math.ceil(17+bin_size_h*(j+1)), int(16+bin_size_w*(i)):math.ceil(16+bin_size_w*(i+1))][0][0].max())\n",
    "    self_result.append(sub_res)\n",
    "self_result[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们用torchvision的roi pooling检验下上面的实现是否正确"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[ 0.4245,  0.4636,  0.3529,  0.0751,  0.3588,  0.6445,  0.0492],\n",
       "        [ 1.1585,  0.3858,  0.3529,  0.0751,  0.3801,  0.9784,  0.6303],\n",
       "        [ 0.8688,  0.2752,  0.2927, -0.1014,  0.2978,  0.9784,  0.6303],\n",
       "        [-0.3445, -0.1203, -0.4138, -0.6199, -0.0419,  0.5004,  0.3759],\n",
       "        [ 0.1199,  0.1690,  0.9712,  0.9712,  0.6120,  0.6155,  0.5975],\n",
       "        [ 0.8110,  0.5443,  1.2172,  1.2172,  0.8078,  1.0437,  1.3142],\n",
       "        [ 0.8775,  0.6454,  1.1695,  1.1637,  0.6355,  1.2764,  1.3474]],\n",
       "       grad_fn=<SelectBackward>)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from torchvision.ops import roi_pool\n",
    "result = roi_pool(\n",
    "                third_level_feature, rois_in_feature,\n",
    "                output_size=(7, 7)\n",
    "            )\n",
    "result[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
